/**
 * Run one vector operation on the NPU and store its scalar result in *out.
 *
 * Programs the SRC_A, SRC_B and DST registers with the addresses of a, b and
 * out, writes the element count together with the start bit to CTRL,
 * busy-waits until the done bit reads back set in CTRL, then copies the
 * RESULT register into *out.
 *
 * NOTE(review): the function name suggests the hardware computes
 * relu(dot(a, b)); neither step is performed in software here -- confirm
 * against the NPU documentation.
 *
 * @param a    first input vector; its address is handed to the NPU
 * @param b    second input vector; its address is handed to the NPU
 * @param len  element count, placed in CTRL starting at bit 16. Presumably a
 *             16-bit field (0..65535) -- TODO confirm. Values outside that
 *             range are truncated by the shift rather than rejected.
 * @param out  receives the value read from RESULT; its address is also
 *             programmed into DST
 */
void npu_vec_dot_relu(int16_t *a, int16_t *b, int len, int32_t *out) {
  /* CTRL layout as used by this routine. */
  const uint32_t ctrl_start = UINT32_C(1) << 0;  /* write 1 to start */
  const uint32_t ctrl_done = UINT32_C(1) << 1;   /* reads 1 when finished */
  const unsigned ctrl_len_shift = 16;            /* length field position */

  /* Go through uintptr_t so the pointer-to-integer conversion is explicit.
   * NOTE(review): assumes addresses fit in 32 bits on this target. */
  NPU->SRC_A = (uint32_t)(uintptr_t)a;
  NPU->SRC_B = (uint32_t)(uintptr_t)b;
  NPU->DST   = (uint32_t)(uintptr_t)out;

  /* Shift in unsigned arithmetic: `len << 16` on a signed int is undefined
   * for negative len or when the result does not fit in int (len >= 32768
   * with a 32-bit int). */
  NPU->CTRL = ((uint32_t)len << ctrl_len_shift) | ctrl_start;

  /* NOTE(review): unbounded spin -- there is no timeout, so this hangs if the
   * done bit never sets. It also relies on the NPU registers being declared
   * volatile; confirm in the register-block definition. */
  while ((NPU->CTRL & ctrl_done) == 0) {
    /* wait for completion */
  }

  *out = NPU->RESULT;
}